#install.packages("rvest") #allows us to parse HTML content and extract the HTML elements from it.
#install.packages("xml2")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(xml2)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(ggplot2)
#
# # Data Wrangling for every X1 attribute; since the data is in vertical format it is necessary to make it in record format for further analysis
# reviews_final_data <- data.frame(matrix(ncol = 13, nrow = 3000), stringsAsFactors = FALSE)
# colnames(reviews_final_data) <- c('Aircraft', 'Type Of Traveller', 'Seat Type', 'Route','Date Flown', 'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity', 'Value For Money', 'Recommended')
# ir = 1
# ic = 1
# for(i in 1:nrow(provisional_data)){
# if(ic >= 14){
# ir = ir+1
# ic <- 1
# }
# if(ic==1){
# if(grepl("Aircraft", provisional_data$X1[i] == TRUE)){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# } else{
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 2){
# if(grepl("Type Of Traveller", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 3){
# if(grepl("Seat Type", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# } else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 4){
# if(grepl("Route", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 5){
# if(grepl("Date Flown", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 6){
# if(grepl("Seat Comfort", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 7){
# if(grepl("Cabin Staff Service", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 8){
# if(grepl("Food & Beverages", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 9){
# if(grepl("Inflight Entertainment", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 10){
# if(grepl("Ground Service", provisional_data$X1[i]) == TRUE){
# reviews_final_data[ir,ic] = provisional_data$X2[i]
# i = i + 1
# }else {
# reviews_final_data[ir,ic] = "9999"
# }
# ic = ic + 1
# }
# if(ic == 11){
# if(grepl("Wifi & Connectivity", provisional_data$X1[i]) == TRUE){
# # reviews_final_data[ir,ic] = provisional_data$X2[i]
# # i = i + 1
# # }else {
# # reviews_final_data[ir,ic] = "9999"
# # }
# # ic = ic + 1
# # }
# # if(ic == 12){
# # if(grepl("Value For Money", provisional_data$X1[i]) == TRUE){
# # reviews_final_data[ir,ic] = provisional_data$X2[i]
# # i = i + 1
# # }else {
# # reviews_final_data[ir,ic] = "9999"
# # }
# # ic = ic + 1
# # }
# # if (ic == 13){
# # if(grepl("Recommended", provisional_data$X1[i]) == TRUE){
# # reviews_final_data[ir,ic] = provisional_data$X2[i]
# # i = i + 1
# # }else {
# # reviews_final_data[ir,ic] = "9999"
# # }
# # ic = ic + 1
# # }
# #
# # }
#
# # Define the column order based on column names
# column_order <- c("Aircraft", "Type Of Traveller", "Seat Type", "Route",
# "Date Flown", "Seat Comfort", "Cabin Staff Service",
# "Food & Beverages", "Inflight Entertainment", "Ground Service",
# "Wifi & Connectivity", "Value For Money", "Recommended")
#
# # Initialize the output data frame with 9999 values
# reviews_final_data <- data.frame(matrix(rep("9999", length(column_order) * nrow(provisional_data)),
# nrow = nrow(provisional_data),
# ncol = length(column_order),
# dimnames = list(NULL, column_order)))
#
# # Loop through the input data and fill in the output data frame
# ic <- 1
# ir <- 1
# for (i in 1:nrow(provisional_data)) {
# if (ic > length(column_order)) {
# ir <- ir + 1
# ic <- 1
# }
# if (grepl(column_order[ic], provisional_data$X1[i])) {
# reviews_final_data[ir, ic] <- provisional_data$X2[i]
# ic <- ic + 1
# }
# }
# Code iteration perfected
# Initialize reviews_final_data; matrix() without data fills cells with NA,
# so fields absent from a review simply remain NA.
reviews_final_data <- matrix(ncol = 13, nrow = 2159)
# Field headers in record order; position j corresponds to column j of
# reviews_final_data. Using a lookup table replaces the original
# 13-branch if/else chain (same exact-match semantics, far less duplication).
field_names <- c("Aircraft", "Type Of Traveller", "Seat Type", "Route",
                 "Date Flown", "Seat Comfort", "Cabin Staff Service",
                 "Food & Beverages", "Inflight Entertainment", "Ground Service",
                 "Wifi & Connectivity", "Value For Money", "Recommended")
# Loop through each row of provisional_data and populate reviews_final_data
ir <- 1  # output row: one per review record
ic <- 1  # running field counter: a full record spans 13 input rows
for (i in seq_len(nrow(provisional_data))) {
  # After 13 fields, start a new review record on the next output row
  if (ic >= 14) {
    ir <- ir + 1
    ic <- 1
  }
  # Find which column this header belongs to (NA when the header is not one
  # of the 13 known fields, in which case the row is skipped)
  j <- match(provisional_data$X1[i], field_names)
  if (!is.na(j)) {
    if (ir > 2159) {
      break  # guard: never write past the preallocated 2159 rows
    }
    reviews_final_data[ir, j] <- provisional_data$X2[i]
  }
  # Increment ic
  ic <- ic + 1
}
# Drop the seven placeholder star-rating columns (positions 6-12); the real
# star ratings are rebuilt separately below from final_ratings.
reviews_final_data <- data.frame(reviews_final_data[, -(6:12)])
names(reviews_final_data) <- c("Aircraft", "Type.Of.Traveller", "Seat.Type",
                               "Route", "Date Flown", "Recommended")
#Data Wrangling for Star Ratings
# Reshape the vertical star-rating values (one rating per row of
# final_ratings, 7 consecutive rows per review) into a wide frame:
# 7 rating columns x 2159 reviews.
revised_ratings <- data.frame(matrix(ncol = 7, nrow = 2159))
colnames(revised_ratings) <- c('Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity', 'Value For Money')
# ir = output row (review), ic = output column (rating category)
ir = 1
ic = 1
for(i in 1:nrow(final_ratings)) { # walk ratings in blocks of 7 per review
revised_ratings[ir, ic] <- final_ratings[i, 1]
# Every 7th input row completes one review: move to the next output row
if (i%%7==0) {
ir <- ir+1
ic <- 1
}
# Stop once an overflow row 2160 has been started; it is removed below
else if(ir == 2160){
break
} else {
ic <- ic+1
}
}
# Drop the partial overflow row if the loop wrote one (a negative index
# beyond nrow(revised_ratings) is a harmless no-op otherwise)
revised_ratings <- revised_ratings[-2160,]
# Combining the three data frames: reviews_final_data (non-rating attributes),
# revised_ratings (star ratings), and text_reviews (free-text customer
# reviews). cbind relies on all three sharing the same row count and order.
reviews <- cbind(reviews_final_data, revised_ratings, text_reviews)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Adding a new column to calculate the total score
ratings_cols <- c("Seat Comfort", "Cabin Staff Service", "Food & Beverages",
"Inflight Entertainment", "Ground Service", "Wifi & Connectivity", "Value For Money")
# Add up the values in the seven rating columns and calculate the mean
reviews <- reviews %>% mutate(`Total Rating` = rowSums(reviews[, ratings_cols])/7)
# The review text carries a verification prefix; extract it into its own
# column so the integrity of each review can be checked separately.
reviews <- reviews %>% mutate(Verification = ifelse(grepl("✅ Trip Verified", Reviews), "Trip Verified", ifelse(grepl("Not Verified", Reviews), "Not Verified", NA)))
# Strip the verification prefixes from the review text.
# Fix: "|" is a regex alternation operator, so it must be escaped (or matched
# with fixed = TRUE). The original unescaped "✅ Trip Verified |" left the
# pipe behind, and gsub("|", "", ...) was a silent no-op, leaving stray "|"
# characters in the cleaned reviews.
reviews$Reviews <- gsub("✅ Trip Verified \\| ", "", reviews$Reviews)
reviews$Reviews <- gsub("✅ Trip Verified \\|", "", reviews$Reviews)
reviews$Reviews <- gsub("Not Verified \\|","", reviews$Reviews)
reviews$Reviews <- gsub("|", "", reviews$Reviews, fixed = TRUE)
# We can break down the "Route" column into two parts: "Origin" and "Destination"
library(tidyr)
Route_updated <- data.frame(reviews$Route)
# Split on the literal " to ". Routes with stopovers ("A to B via C") keep the
# "via ..." text inside Destination (cleaned up further below); malformed
# routes produce the discarded/NA-piece warnings shown.
Route_updated <- separate(Route_updated, col = reviews.Route, into = c("Origin", "Destination"), sep = c(" to "))
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [301, 1471].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1337, 1752, 1941,
## 1976].
head(Route_updated)
## Origin Destination
## 1 New York Rome
## 2 Los Angeles Melbourne
## 3 Cape Town Atlanta
## 4 San Juan Paris via New York JFK
## 5 Ft Lauderdale Atlanta
## 6 Ft Lauderdale Atlanta
# Drop the original Route column (position 4) and append the split columns
reviews <- reviews[,-4]
reviews <- cbind(reviews, Route_updated)
# Formatting the Date Flown column in the Data frame.
class(reviews$`Date Flown`)
## [1] "character"
library(lubridate)
# Convert Date Flown ("Month YYYY") to a Date by prefixing day 01, then
# reformat as "MM-YY"; unparseable values become NA (178, per the warning)
reviews$`Date Flown` <- dmy(paste0("01-", reviews$`Date Flown`))
## Warning: 178 failed to parse.
reviews$`Date Flown` <- format(reviews$`Date Flown`, "%m-%y")
head(reviews$`Date Flown`)
## [1] "03-23" "04-23" "03-23" "04-23" "04-23" "04-23"
# Identifying the missing values, column by column (counts below were recorded
# on the original 2159-row scrape and drive the imputation decisions that follow)
cat("There are", sum(is.na(reviews$Aircraft)), "missing data values in 'Aircraft' column.\n")
## There are 1614 missing data values in 'Aircraft' column.
cat("There are", sum(is.na(reviews$Type.Of.Traveller)), "missing data values in 'Type of traveller' column.\n")
## There are 177 missing data values in 'Type of traveller' column.
cat("There are", sum(is.na(reviews$Seat.Type)), "missing data values in 'Seat Type' column.\n")
## There are 0 missing data values in 'Seat Type' column.
cat("There are", sum(is.na(reviews$`Date Flown`)), "missing data values in 'Date Flown' column.\n")
## There are 178 missing data values in 'Date Flown' column.
cat("There are", sum(is.na(reviews$Recommended)), "missing data values in 'Recommended' column.\n")
## There are 0 missing data values in 'Recommended' column.
cat("There are", sum(is.na(reviews$`Seat Comfort`)), "missing data values in 'Seat Comfort' column.\n")
## There are 0 missing data values in 'Seat Comfort' column.
cat("There are", sum(is.na(reviews$`Cabin Staff Service`)), "missing data values in 'Cabin Staff Services' column.\n")
## There are 0 missing data values in 'Cabin Staff Services' column.
cat("There are", sum(is.na(reviews$`Food & Beverages`)), "missing data values in 'Food & Beverages' column.\n")
## There are 0 missing data values in 'Food & Beverages' column.
cat("There are", sum(is.na(reviews$`Inflight Entertainment`)), "missing data values in 'Inflight Entertainment' column.\n")
## There are 0 missing data values in 'Inflight Entertainment' column.
cat("There are", sum(is.na(reviews$`Ground Service`)), "missing data values in 'Ground Service' column.\n")
## There are 0 missing data values in 'Ground Service' column.
cat("There are", sum(is.na(reviews$`Wifi & Connectivity`)), "missing data values in 'Wifi & Connectivity' column.\n")
## There are 0 missing data values in 'Wifi & Connectivity' column.
cat("There are", sum(is.na(reviews$`Value For Money`)), "missing data values in 'Value for Money' column.\n")
## There are 0 missing data values in 'Value for Money' column.
cat("There are", sum(is.na(reviews$Reviews)), "missing data values in 'Reviews' column.\n")
## There are 0 missing data values in 'Reviews' column.
cat("There are", sum(is.na(reviews$Verification)), "missing data values in 'Verification' column.\n")
## There are 5 missing data values in 'Verification' column.
cat("There are", sum(is.na(reviews$Origin)), "missing data values in 'Origin' column.\n")
## There are 187 missing data values in 'Origin' column.
cat("There are", sum(is.na(reviews$Destination)), "missing data values in 'Destination' column.\n")
## There are 191 missing data values in 'Destination' column.
# Removing the 'Aircraft' column since it has 1614 missing values (per the
# report above) and carries no usable information.
reviews <- reviews[,-1]
# Replacing the NAs in 'Type of Traveller' with random draws from the four
# traveller categories.
# Fix: the original ifelse(is.na(x), sample(values, 1), x) evaluated sample()
# exactly once, so every NA received the same single category rather than
# independent random values. Sizing sample() to the NA count (as the Date
# Flown imputation below already does) gives genuinely random replacements.
na_traveller <- is.na(reviews$Type.Of.Traveller)
reviews$Type.Of.Traveller[na_traveller] <- sample(c("Solo Leisure", "Business", "Couple Leisure", "Family Leisure"), sum(na_traveller), replace = TRUE)
na_seat <- is.na(reviews$Seat.Type)
reviews$Seat.Type[na_seat] <- sample(c("Economy Class", "First Class", "Premium Economy", "Business Class"), sum(na_seat), replace = TRUE)
# Replacing the NAs in 'Date Flown' with random months between April 2015 and September 2022
months <- seq(from = ymd("2015-04-01"), to = ymd("2022-09-01"), by = "months")
months_str <- format(months, "%m-%y")
reviews$`Date Flown`[is.na(reviews$`Date Flown`)] <- sample(months_str, sum(is.na(reviews$`Date Flown`)), replace = TRUE)
# Replacing the remaining NA verification flags with "Not Verified"
reviews$Verification[is.na(reviews$Verification)] <- "Not Verified"
# Origin and Destination NA values cleaning: drop rows where the route split failed
reviews <- reviews[complete.cases(reviews[c("Origin", "Destination")]),]
# Destination additional locations cleaning: keep only the stop(s) named after "via"
reviews$Final.Destination <- gsub("(.*via\\s)(\\w+)(\\s\\&\\s)(\\w+)(.*)", "\\2, \\4", reviews$Destination)
reviews$Final.Destination <- gsub("(.*via\\s)(\\w+)(.*)", "\\2", reviews$Final.Destination)
# Drop the raw Destination column (position 16), superseded by Final.Destination
reviews <- reviews[,-16]
# Restore multi-word city names truncated by the single-word \\w+ capture above
reviews$Final.Destination <- ifelse(grepl("New", reviews$Final.Destination), "New York", ifelse(grepl("Salt", reviews$Final.Destination), "Salt Lake City",reviews$Final.Destination))
# Re-check missingness after imputation and route cleaning: every column
# should now report zero NAs.
cat("There are", sum(is.na(reviews$Type.Of.Traveller)), "missing data values in 'Type of traveller' column.\n")
## There are 0 missing data values in 'Type of traveller' column.
cat("There are", sum(is.na(reviews$Seat.Type)), "missing data values in 'Seat Type' column.\n")
## There are 0 missing data values in 'Seat Type' column.
cat("There are", sum(is.na(reviews$Route)), "missing data values in 'Route' column.\n")
## There are 0 missing data values in 'Route' column.
cat("There are", sum(is.na(reviews$`Date Flown`)), "missing data values in 'Date Flown' column.\n")
## There are 0 missing data values in 'Date Flown' column.
cat("There are", sum(is.na(reviews$Recommended)), "missing data values in 'Recommended' column.\n")
## There are 0 missing data values in 'Recommended' column.
cat("There are", sum(is.na(reviews$`Seat Comfort`)), "missing data values in 'Seat Comfort' column.\n")
## There are 0 missing data values in 'Seat Comfort' column.
cat("There are", sum(is.na(reviews$`Cabin Staff Service`)), "missing data values in 'Cabin Staff Services' column.\n")
## There are 0 missing data values in 'Cabin Staff Services' column.
cat("There are", sum(is.na(reviews$`Food & Beverages`)), "missing data values in 'Food & Beverages' column.\n")
## There are 0 missing data values in 'Food & Beverages' column.
cat("There are", sum(is.na(reviews$`Inflight Entertainment`)), "missing data values in 'Inflight Entertainment' column.\n")
## There are 0 missing data values in 'Inflight Entertainment' column.
cat("There are", sum(is.na(reviews$`Ground Service`)), "missing data values in 'Ground Service' column.\n")
## There are 0 missing data values in 'Ground Service' column.
cat("There are", sum(is.na(reviews$`Wifi & Connectivity`)), "missing data values in 'Wifi & Connectivity' column.\n")
## There are 0 missing data values in 'Wifi & Connectivity' column.
cat("There are", sum(is.na(reviews$`Value For Money`)), "missing data values in 'Value for Money' column.\n")
## There are 0 missing data values in 'Value for Money' column.
cat("There are", sum(is.na(reviews$Reviews)), "missing data values in 'Reviews' column.\n")
## There are 0 missing data values in 'Reviews' column.
cat("There are", sum(is.na(reviews$Verification)), "missing data values in 'Verification' column.\n")
## There are 0 missing data values in 'Verification' column.
cat("There are", sum(is.na(reviews$Origin)), "missing data values in 'Origin' column.\n")
## There are 0 missing data values in 'Origin' column.
cat("There are", sum(is.na(reviews$Destination)), "missing data values in 'Destination' column.\n")
## There are 0 missing data values in 'Destination' column.
# Final structure check: 1968 rows remain after dropping unparseable routes
summary(reviews)
## Type.Of.Traveller Seat.Type Date Flown Recommended
## Length:1968 Length:1968 Length:1968 Length:1968
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :2.174 Mean :2.173 Mean :2.218 Mean :2.181
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Ground Service Wifi & Connectivity Value For Money Reviews
## Min. :1.000 Min. :1.000 Min. :1.000 Length:1968
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 Class :character
## Median :2.000 Median :2.000 Median :2.000 Mode :character
## Mean :2.148 Mean :2.173 Mean :2.177
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Total Rating Verification Origin Final.Destination
## Min. :1.000 Length:1968 Length:1968 Length:1968
## 1st Qu.:1.714 Class :character Class :character Class :character
## Median :2.286 Mode :character Mode :character Mode :character
## Mean :2.178
## 3rd Qu.:2.714
## Max. :3.429
dim(reviews)
## [1] 1968 16
# Grand total of the seven star ratings per traveller type.
# Vectorized sum() over the rating columns replaces the four original
# copy-pasted per-row loops; the totals are identical because the rating
# columns contain no NAs (verified in the missingness checks above).
rating_cols <- c("Seat Comfort", "Cabin Staff Service", "Food & Beverages",
                 "Inflight Entertainment", "Ground Service",
                 "Wifi & Connectivity", "Value For Money")
Total_ratings_Business <- sum(reviews[reviews$Type.Of.Traveller == "Business", rating_cols])
Total_ratings_Business
## [1] 5915
Total_ratings_Couple <- sum(reviews[reviews$Type.Of.Traveller == "Couple Leisure", rating_cols])
Total_ratings_Couple
## [1] 6856
Total_ratings_Family <- sum(reviews[reviews$Type.Of.Traveller == "Family Leisure", rating_cols])
Total_ratings_Family
## [1] 7297
Total_ratings_Solo <- sum(reviews[reviews$Type.Of.Traveller == "Solo Leisure", rating_cols])
Total_ratings_Solo
## [1] 9933
#Word Cloud
# install.packages("wordcloud")
# library(wordcloud)
# install.packages("RColorBrewer")
# library(RColorBrewer)
# install.packages("tm")
# library(tm)
# install.packages("openNLP")
# install.packages("rJava")
# #library(openNLP)
# install.packages("NLP")
# library(NLP)
#
# # Define a function to extract only adjectives from the text data
# adj_extractor <- function(text) {
# word_tokenizer <- Maxent_Word_Token_Annotator()
# pos_tag_annotator <- Maxent_POS_Tag_Annotator()
# text <- as.String(text)
# annotations <- NLP::annotate(text, list(word_tokenizer, pos_tag_annotator))
# adjectives <- subset(annotations$POS, Type == "JJ")
# adj_words <- text[adjectives]
# return(adj_words)
# }
#
# # Apply the adjective extractor to the text data
# text_data <- reviews$Reviews
# text_data <- gsub("flights", "", text_data)
# text_data <- gsub("plane", "", text_data)
# text_data <- gsub("airport", "", text_data)
# text_data <- gsub("get", "", text_data)
# text_data <- gsub("airline", "", text_data)
# text_data <- gsub("airlines", "", text_data)
# text_data <- gsub("flight", "", text_data)
#
# adj_text_data <- sapply(text_data, adj_extractor)
# adj_text_data <- unlist(adj_text_data)
# adj_text_data <- gsub("[^[:alnum:][:space:]]*", "", adj_text_data)
#
# docs <- Corpus(VectorSource(adj_text_data))
#
# docs <- docs %>%
# tm_map(removeNumbers) %>%
# tm_map(removePunctuation) %>%
# tm_map(stripWhitespace)
#
# dtm <- TermDocumentMatrix(docs)
# matrix <- as.matrix(dtm)
# words <- sort(rowSums(matrix), decreasing=TRUE)
# df <- data.frame(word = names(words), freq = words)
#
# set.seed(1234)
# wordcloud(
# words = df$word,
# freq = df$freq,
# min.freq = 1,
# max.words = 200,
# random.order = FALSE,
# rot.per = 0.35,
# colors = brewer.pal(8, "Dark2")
# )
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("tm")
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Remove high-frequency domain words so they don't dominate the cloud.
# Fix: plural patterns must be removed before their singular prefix
# ("airlines" before "airline", mirroring "flights" before "flight");
# the original order stripped "airline" out of "airlines" first, leaving
# stray "s" fragments and making the "airlines" pattern a no-op.
# NOTE(review): these substring removals also hit longer words (e.g. "get"
# inside "together") — word-boundary patterns would be safer; confirm intent.
text_data <- reviews$Reviews
text_data <- gsub("flights","",text_data)
text_data <- gsub("plane", "", text_data)
text_data <- gsub("airport", "", text_data)
text_data <- gsub("get","", text_data)
text_data <- gsub("airlines","", text_data)
text_data <- gsub("airline", "", text_data)
text_data <- gsub("flight", "", text_data)
# Build the corpus and normalize: strip numbers/punctuation/whitespace,
# lowercase, and drop English stopwords before counting term frequencies.
docs <- Corpus(VectorSource(text_data))
docs <- docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Term-document matrix -> per-term frequencies, sorted for the cloud
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
df <- data.frame(word = names(words),freq=words)
# Fixed seed so the word-cloud layout is reproducible
set.seed(1234)
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
library(ggplot2)
# Column chart: summed `Total Rating` per traveller type.
# NOTE(review): `col` colours only the bar outlines; `fill` may have been
# intended — kept as-is to preserve the original plot.
ggplot(reviews, aes(x = Type.Of.Traveller, y = `Total Rating`, col = Type.Of.Traveller)) +
  geom_col() +
  labs(x = "Type of Traveler", y = "Total Ratings", title = "By travel type")
library(ggplot2)
# Column chart: summed `Total Rating` per seat type.
ggplot(reviews, aes(x = Seat.Type, y = `Total Rating`, col = Seat.Type)) +
  geom_col() +
  labs(x = "Seat Type", y = "Total Ratings", title = "By Seat type")
library(ggplot2)
# Column chart: summed `Total Rating` by verification status.
# Fix: the x-axis label read "Seat Type" — a copy-paste leftover from the
# previous plot; this chart puts Verification on the x axis.
ggplot(reviews, aes(x = `Verification`, y = `Total Rating`, col=`Total Rating`)) +
geom_col() +
xlab("Verification") +
ylab("Total Ratings") +
ggtitle("By Verification")
# Pie chart of the traveller-type distribution, labelled with raw counts.
type_counts <- table(reviews$Type.Of.Traveller)
type_df <- data.frame(Type.Of.Traveller = names(type_counts),
                      count = as.numeric(type_counts))
# A stacked single bar wrapped into polar coordinates renders as a pie.
ggplot(type_df, aes(x = "", y = count, fill = Type.Of.Traveller)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = count), position = position_stack(vjust = 0.5)) +
  scale_fill_discrete(name = "Type of traveller") +
  ggtitle("Type of traveler distribution") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.line.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Pairwise correlations between six rating columns
# (note: "Value For Money" is not included here), melted to long form so
# each (Var1, Var2) correlation becomes one bar segment in the chart below.
corr_data <- cor(reviews[, c("Seat Comfort", "Cabin Staff Service", "Food & Beverages",
"Inflight Entertainment", "Ground Service", "Wifi & Connectivity")])
melted_corr <- melt(corr_data)
# Stacked column chart of correlations per amenity (an unconventional view
# of a correlation matrix; the corrplot further below shows the same data
# as a standard matrix plot)
ggplot(melted_corr, aes(x = Var1, y = value, fill = Var2)) +
geom_col() +
scale_fill_discrete(name = "Amenities") +
theme_minimal() +
ggtitle("Correlation between Amenities") +
xlab("Amenities") +
ylab("Correlation") +
theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12),
legend.position = "bottom")
library(ggplot2)
# Stacked bar chart: recommendation outcome within each verification status.
ggplot(reviews, aes(x = Verification, fill = Recommended)) +
  geom_bar() +
  scale_fill_discrete(name = "Recommended") +
  theme_minimal() +
  labs(title = "Verification vs Recommended", x = "Verification", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        axis.title = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        legend.position = "bottom")
# Box plot of Total Rating by verification status, overlaid with points sized
# by rating and coloured by traveller type.
# Fix: the colour aesthetic referenced reviews$`Type Of Traveler`, a column
# that does not exist (the real column is Type.Of.Traveller), so the points
# silently lost their colour mapping; also use bare column names inside
# aes() rather than `reviews$...`.
reviews %>% ggplot(aes(Verification, `Total Rating`)) +
  geom_boxplot() +
  geom_point(alpha = 0.5, aes(size = `Total Rating`, color = Type.Of.Traveller))
# Correlation plot between amenities
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of the seven amenity columns (5-11), rendered as
# ellipses with the numeric coefficients overlaid in black.
amenities_df <- reviews[, 5:11]
col <- colorRampPalette(c("Black", "lightblue", "lightgreen"))
amenity_correlations <- cor(amenities_df)
corrplot(amenity_correlations,
         method = "ellipse",
         col = col(500),
         addCoef.col = "black",
         tl.col = "black")
## Warning in ind1:ind2: numerical expression has 2 elements: only the first used
pairs(amenities_df, main = "Ameneties", pch = 21, bg = c("#CFB87C"))
library(ggplot2)
# Dodged bar chart: review counts per "Date Flown" value, split by whether
# the flight was recommended (green = "yes", red = "no").
ggplot(reviews, aes(x = `Date Flown`, fill = Recommended)) +
geom_bar(position = "dodge") +
scale_fill_manual(values = c("yes" = "green", "no" = "red")) +
labs(x = "Date Flown", y = "Count", fill = "Recommendation")
# K-Means
# install.packages("factoextra")
# load required libraries
# Load required libraries
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Prerequisites: keep the traveller-type labels aside so the cluster
# assignments can later be compared against the known segments.
Reviews.Labels <- reviews[["Type.Of.Traveller"]]
table(Reviews.Labels)
## Reviews.Labels
## Business Couple Leisure Family Leisure Solo Leisure
## 386 460 473 649
# Six amenity rating columns (5-10) form the k-means feature matrix.
reviews_kmeans_clustering <- reviews[, 5:10]
head(reviews_kmeans_clustering)
## Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## 1 1 2 3 1
## 2 2 3 1 2
## 3 1 2 3 1
## 4 1 2 3 1
## 5 3 1 1 1
## 6 1 1 1 2
## Ground Service Wifi & Connectivity
## 1 2 3
## 2 3 1
## 3 2 1
## 4 1 1
## 5 1 1
## 6 3 4
# Standardise the features so every amenity contributes equally to the
# distance computation.
reviews_kmeans_clustering_scale <- scale(reviews_kmeans_clustering)
# Choose k via total within-cluster sum of squares (elbow) and the average
# silhouette width.
fviz_nbclust(reviews_kmeans_clustering_scale, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow Method")
fviz_nbclust(reviews_kmeans_clustering_scale, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette Method")
# Euclidean distance matrix of the scaled observations.
reviews.dist <- dist(reviews_kmeans_clustering_scale)
# Perform K-Means with k = 4 (suggested by the elbow plot above); nstart =
# 1000 random restarts reduce the risk of landing in a poor local optimum.
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 4, nstart = 1000)
print(km.out)
## K-means clustering with 4 clusters of sizes 897, 231, 434, 406
##
## Cluster means:
## Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## 1 -0.4127647 -0.50050225 -0.5482564 -0.5569192
## 2 0.3345114 1.12412374 1.8558250 -0.9156574
## 3 1.2273702 0.39093024 -0.5959371 0.1491683
## 4 -0.5903961 0.04831086 0.7924312 1.5919565
## Ground Service Wifi & Connectivity
## 1 -0.5015258 -0.4017139
## 2 -0.2820461 0.2425646
## 3 0.8787580 1.2184140
## 4 0.3291632 -0.5529230
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1 1 1 1 1 1 1 1 1 2 1 3 1 1 4 1
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 2 1 1 1 1 1 3 2 4 4 3 1 3 2 3 4
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 1 1 1 1 1 3 1 3 4 1 1 1 1 1 1
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 1 1 1 4 2 4 2 3 1 1 3 4 3 3 1 1
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 3 4 3 4 1 2 1 1 3 4 3 4 2 4
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 2 3 4 1 1 1 1 1 3 4 3 1 2 1 2 1
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 1 3 1 1 1 1 1 1 1 1 3 4 3 4 3 2
## 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 4 3 4 3 1 1 1 1 1 2 3 4 2 2 3 2
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 3 1 3 3 4 3 4 3 3 4 2 1 4 2 3 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 3 1 1 1 1 1 2 3 4 3 4 4 3 1 3 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
## 1 1 3 4 3 2 1 1 3 4 3 4 2 1 1 1
## 177 178 179 180 181 182 183 184 185 186 187 188 189 191 192 193
## 1 1 1 1 1 4 1 1 1 1 4 3 1 1 1 1
## 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
## 1 1 1 1 3 4 2 3 2 1 1 1 1 4 1 3
## 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
## 1 4 1 1 3 1 3 4 3 1 1 4 3 4 3 4
## 226 227 228 229 230 231 232 233 234 235 236 237 238 240 241 242
## 1 4 4 3 2 3 4 3 4 2 4 2 3 1 3 1
## 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
## 3 1 4 3 4 3 4 3 3 2 1 1 1 1 4 3
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
## 3 4 3 2 3 4 1 1 3 4 2 3 4 1 1 1
## 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
## 1 3 4 3 4 1 1 1 3 2 2 1 1 4 4 3
## 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 4 3 1 4 2 3 4 3 3 3 1 1 4 4 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
## 1 1 1 4 3 4 2 3 4 2 1 3 4 3 3 1
## 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
## 3 1 1 1 1 1 3 3 3 4 2 3 4 2 4 1
## 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
## 1 1 4 4 1 3 4 2 3 2 1 1 1 4 1 1
## 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
## 1 4 1 1 3 3 4 1 1 2 1 1 1 1 1 3
## 371 372 373 374 375 376 377 378 379 380 382 383 384 385 386 387
## 1 1 1 1 4 3 3 4 1 1 1 1 4 3 4 2
## 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
## 1 3 1 4 3 4 2 3 4 4 2 3 2 4 4 1
## 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
## 3 1 1 1 1 4 1 1 3 2 4 3 4 2 3 4
## 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
## 4 4 1 1 1 2 1 1 1 1 2 3 4 3 4 3
## 436 437 439 440 441 442 443 444 445 446 447 448 449 450 451 452
## 4 1 1 3 1 1 3 2 3 3 1 4 2 1 4 3
## 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 4 2 3 1 1 1 1 1 1 1 1 1 1 3 2 1
## 469 470 471 472 473 474 475 477 478 479 480 481 482 483 484 485
## 3 4 3 3 1 1 3 2 1 4 3 4 2 3 1 1
## 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
## 1 1 2 3 1 3 2 3 4 1 1 3 1 1 1 1
## 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
## 3 1 1 1 4 1 1 2 4 3 2 3 1 3 4 2
## 518 519 520 521 522 523 524 525 527 528 529 530 531 532 533 534
## 4 3 1 1 1 1 3 4 3 2 4 1 4 3 4 3
## 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
## 3 4 3 4 1 1 1 1 1 4 3 4 2 3 1 4
## 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 567
## 2 4 1 4 2 2 2 4 2 3 1 1 3 4 2 1
## 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
## 4 3 1 3 1 1 3 4 2 3 4 3 1 1 1 1
## 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
## 2 1 2 3 4 3 4 2 3 4 3 1 1 1 3 1
## 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615
## 1 1 4 4 3 4 1 1 1 1 1 1 1 1 4 1
## 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
## 1 1 3 4 2 3 4 4 1 1 1 1 1 1 3 3
## 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
## 4 4 2 4 4 1 1 4 3 4 2 1 1 1 2 3
## 648 649 650 651 652 653 655 656 657 658 659 660 661 662 663 664
## 4 3 1 1 1 1 2 1 2 3 3 4 1 1 1 4
## 665 666 667 668 669 670 671 672 673 674 675 677 678 679 680 681
## 3 4 2 1 1 1 1 3 2 1 2 4 3 4 1 1
## 682 683 684 685 686 688 689 690 691 692 693 694 695 696 697 698
## 1 1 3 4 3 1 3 1 1 1 1 1 1 1 3 2
## 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
## 1 1 1 1 1 1 1 3 4 4 1 3 1 1 1 1
## 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
## 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
## 1 1 1 1 1 1 1 1 3 1 2 3 1 1 1 4
## 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762
## 2 1 1 4 3 1 1 1 1 1 4 3 4 2 3 1
## 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778
## 1 1 1 1 1 1 1 1 4 1 1 2 3 4 3 4
## 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794
## 2 1 1 1 1 2 1 1 1 1 2 4 1 1 1 2
## 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
## 4 3 3 1 1 1 4 3 3 4 1 1 1 1 2 3
## 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
## 3 2 2 3 4 3 4 1 1 1 4 1 4 1 3 2
## 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842
## 4 1 1 1 3 3 1 1 4 3 1 2 1 4 2 1
## 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858
## 3 4 3 4 2 1 1 1 4 2 1 3 2 3 1 4
## 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874
## 1 1 1 1 1 3 1 1 1 2 4 3 4 1 2 4
## 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890
## 1 1 2 4 1 1 2 3 2 3 4 1 1 1 1 2
## 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
## 3 3 2 1 3 4 3 4 3 4 2 3 4 3 4 1
## 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922
## 1 2 3 1 1 1 1 4 2 3 3 1 1 1 3 4
## 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
## 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1
## 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954
## 1 3 4 3 4 3 1 1 1 1 1 1 3 2 1 1
## 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
## 3 1 1 1 1 1 1 1 1 1 1 1 1 4 4 2
## 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986
## 1 1 1 1 1 1 3 2 3 2 1 2 4 2 4 3
## 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
## 1 1 4 3 1 1 1 3 4 4 1 1 4 3 4 1
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
## 1 1 1 1 1 2 3 3 3 4 3 4 3 1 1 1
## 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
## 1 1 1 1 1 1 3 2 4 3 4 2 3 4 1 3
## 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
## 3 4 3 1 1 1 1 1 1 3 1 1 1 2 2 1
## 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
## 1 1 1 1 1 1 1 2 1 3 4 2 3 1 3 4
## 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082
## 3 1 1 1 2 1 1 1 1 1 3 1 1 1 1 4
## 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098
## 1 1 1 1 1 1 1 4 1 3 4 3 2 3 2 3
## 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114
## 4 3 2 4 1 4 1 1 1 1 1 1 1 3 4 2
## 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130
## 4 4 2 2 1 1 1 1 2 1 1 3 2 1 1 2
## 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
## 1 2 4 4 4 4 1 3 4 3 4 3 1 1 1 1
## 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162
## 3 4 3 2 3 1 2 2 1 1 4 3 4 2 1 3
## 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178
## 3 4 4 4 1 1 1 4 2 3 4 1 4 1 1 4
## 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
## 1 1 2 3 3 1 1 3 3 4 3 2 3 4 3 4
## 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
## 2 3 4 3 1 1 1 1 3 1 1 3 1 2 4 4
## 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226
## 1 1 1 4 2 3 4 3 1 3 2 1 1 1 2 3
## 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242
## 4 3 4 1 1 2 3 3 4 3 2 3 4 1 3 1
## 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258
## 3 3 1 1 1 1 4 1 2 1 1 1 4 3 4 2
## 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274
## 3 1 1 1 4 4 3 4 2 2 4 1 4 1 4 3
## 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
## 1 2 4 1 1 1 3 1 3 2 1 1 1 1 1 1
## 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306
## 1 1 1 1 1 1 1 3 1 3 1 3 2 4 1 1
## 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322
## 1 1 4 1 3 3 4 1 1 1 3 1 1 1 1 4
## 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1338 1339
## 1 1 1 1 4 1 4 1 1 1 1 1 1 1 3 4
## 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355
## 3 4 4 4 4 1 1 1 1 1 1 1 1 3 1 4
## 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371
## 4 1 1 4 3 1 1 1 1 1 1 1 1 1 1 1
## 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387
## 4 1 1 3 1 3 1 1 1 2 1 1 1 1 4 2
## 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403
## 4 1 3 1 1 1 1 2 4 3 1 1 1 1 2 4
## 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419
## 3 4 2 4 1 1 1 1 4 1 1 3 1 2 1 3
## 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435
## 4 2 1 1 1 1 4 4 1 1 1 2 1 1 4 1
## 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451
## 2 3 4 1 1 4 3 4 3 1 2 4 1 2 3 4
## 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467
## 1 1 1 1 1 4 1 1 1 1 1 3 1 1 1 1
## 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483
## 2 1 1 1 2 3 4 3 1 1 4 1 1 1 1 4
## 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499
## 3 4 2 1 1 1 1 1 2 3 4 3 1 1 4 3
## 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515
## 4 3 4 3 4 3 4 3 3 3 2 3 3 4 1 1
## 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531
## 3 2 3 4 3 3 4 1 1 1 1 1 1 1 3 1
## 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547
## 1 1 1 4 1 1 1 2 3 1 1 1 1 3 3 4
## 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563
## 1 3 4 3 4 3 4 1 1 3 4 3 2 3 4 3
## 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579
## 4 2 1 1 1 1 4 1 3 1 2 1 3 4 3 4
## 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595
## 2 1 1 1 4 2 2 2 3 4 3 4 3 1 2 3
## 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611
## 1 4 3 1 1 1 1 1 4 1 3 2 3 4 4 2
## 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627
## 4 3 3 1 3 4 2 3 4 3 4 2 3 4 3 1
## 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643
## 1 1 4 2 1 3 3 4 1 4 2 4 3 1 1 3
## 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659
## 3 4 1 2 3 4 3 2 1 4 1 3 4 4 1 1
## 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675
## 2 3 2 1 1 4 4 1 1 1 1 1 1 1 4 2
## 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691
## 4 4 4 3 1 4 2 1 3 4 3 1 1 1 4 2
## 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707
## 3 4 4 2 1 3 1 3 3 1 3 4 3 1 1 4
## 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723
## 3 2 1 2 1 1 3 1 4 3 4 1 3 2 4 4
## 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739
## 1 3 3 4 1 4 3 3 4 1 1 1 1 1 1 3
## 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1753 1754 1755 1756
## 4 4 2 3 2 3 4 1 1 1 1 1 4 3 4 3
## 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772
## 4 3 1 3 1 1 1 1 4 3 2 4 1 1 2 3
## 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788
## 4 4 3 4 2 1 1 4 4 1 3 4 2 3 4 1
## 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804
## 3 4 1 4 1 1 1 1 1 1 2 1 1 1 1 1
## 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820
## 1 1 1 2 4 3 2 2 1 1 1 4 2 3 4 2
## 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836
## 3 1 4 3 2 3 1 4 1 1 1 1 1 1 1 3
## 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852
## 4 3 3 4 2 3 4 3 4 2 1 1 1 1 4 3
## 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868
## 4 2 2 3 4 4 1 1 1 1 3 3 3 4 3 4
## 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884
## 2 3 4 3 4 2 3 4 3 4 1 2 1 3 4 2
## 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900
## 4 4 4 3 3 4 2 3 4 3 1 3 4 1 1 4
## 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916
## 3 4 3 4 2 4 3 1 3 3 2 3 4 3 4 2
## 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932
## 4 1 1 3 1 4 1 3 1 1 2 3 4 2 3 3
## 1933 1934 1935 1936 1937 1938 1939 1940 1942 1943 1944 1945 1946 1947 1948 1949
## 4 3 4 2 1 3 4 3 1 3 4 2 1 1 1 4
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965
## 1 1 1 1 4 2 3 4 1 1 1 1 1 1 1 1
## 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1977 1978 1979 1980 1981 1982
## 1 1 1 4 3 4 2 3 3 4 3 4 2 3 4 2
##
## Within cluster sum of squares by cluster:
## [1] 2148.4354 225.1447 2014.8653 1321.0365
## (between_SS / total_SS = 51.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# 51.6 % between_SS / total_SS - could be better.
# Label each scaled observation with the first letter of its traveller type
# so the cluster plots show which segment every point came from.
reviews_tot <- reviews[1]
names(reviews_tot) <- c("Type.Of.Traveler")
reviews_tot$Type.Of.Traveler <- substr(reviews_tot$Type.Of.Traveler, 1, 1)
# FIX: seq_len(nrow(...)) replaces 1:dim(...)[1] — same values here, but the
# idiomatic form is also safe for zero-row input.
rownames(reviews_kmeans_clustering_scale) <- paste(reviews_tot$Type.Of.Traveler,
                                                   seq_len(nrow(reviews_tot)),
                                                   sep = "_")
head(reviews_kmeans_clustering_scale)
## Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## B_1 -0.9268708 -0.1363212 0.6099306 -0.9156574
## S_2 -0.1372250 0.6527378 -0.9499698 -0.1402642
## C_3 -0.9268708 -0.1363212 0.6099306 -0.9156574
## C_4 -0.9268708 -0.1363212 0.6099306 -0.9156574
## S_5 0.6524207 -0.9253802 -0.9499698 -0.9156574
## F_6 -0.9268708 -0.9253802 -0.9499698 -0.1402642
## Ground Service Wifi & Connectivity
## B_1 -0.1174966 0.6526826
## S_2 0.6743981 -0.9262718
## C_3 -0.1174966 -0.9262718
## C_4 -0.9093913 -0.9262718
## S_5 -0.9093913 -0.9262718
## F_6 0.6743981 1.4421598
# Visual comparison of k-means solutions at k = 3, 4, 7 and 8.
# FIX: kmeans() stores the assignments in component $cluster; the original
# read $clusters, which is NULL (see the "Available components" listing in
# the printed kmeans output above).
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 3, nstart = 100)
km.clusters <- km.out$cluster
p1 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale,
                        cluster = km.out$cluster)) + ggtitle("k=3")
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 4, nstart = 100)
km.clusters <- km.out$cluster
p2 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale,
                        cluster = km.out$cluster)) + ggtitle("k=4")
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 7, nstart = 100)
km.clusters <- km.out$cluster
p3 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale,
                        cluster = km.out$cluster)) + ggtitle("k=7")
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 8, nstart = 100)
km.clusters <- km.out$cluster
p4 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale,
                        cluster = km.out$cluster)) + ggtitle("k=8")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1,p2,p3,p4, nrow=2)
# Prerequisites: traveller-type labels for reference.
Reviews.Labels <- reviews$Type.Of.Traveller
table(Reviews.Labels)
## Reviews.Labels
## Business Couple Leisure Family Leisure Solo Leisure
## 386 460 473 649
# All seven amenity rating columns (5-11) feed the hierarchical clustering.
reviews_hierarchical_clustering <- reviews[5:11]
# Scale the data
reviews_hierarchical_clustering_scale <- scale(reviews_hierarchical_clustering)
# Distance derived from cosine similarity.
# NOTE(review): crossprod(X) is t(X) %*% X — a 7 x 7 matrix — so this distance
# is between the amenity COLUMNS, not between individual reviews, and the
# dendrogram has 7 leaves. Confirm that clustering the variables (rather than
# the observations) is the intent.
cosine_matrix <- as.matrix(reviews_hierarchical_clustering_scale)
reviews_cosine_distance <- 1 - crossprod(cosine_matrix) / sqrt(colSums(cosine_matrix^2) %*% t(colSums(cosine_matrix^2)))
hierarchical.dist <- as.dist(reviews_cosine_distance)
# Dendrogram with Ward linkage.
# FIX: the original ran the identical hclust() call twice (once before and
# once after building hierarchical.dist); the redundant first call is removed.
hc.out_reviews <- hclust(hierarchical.dist, method = "ward.D")
plot(hc.out_reviews)
rect.hclust(hc.out_reviews, k=3, border = 2:5)
#install.packages("arules")
#install.packages("arulesViz")
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:tm':
##
## inspect
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
# Columns used for association-rule mining: traveller type, seat type,
# recommendation flag, and the total rating (binned later by apriori's
# default discretization).
reviews_apriori <- reviews[, c(1, 2, 4, 13)]
summary(reviews_apriori)
## Type.Of.Traveller Seat.Type Recommended Total Rating
## Length:1968 Length:1968 Length:1968 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.714
## Mode :character Mode :character Mode :character Median :2.286
## Mean :2.178
## 3rd Qu.:2.714
## Max. :3.429
rule1 <- apriori(reviews_apriori, parameter=list(suppor = 0.002, confidence = 0.5))
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.002 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 1968 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [167 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(sort(rule1, by = "lift"), 20))
## lhs rhs support confidence coverage lift count
## [1] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Total Rating=[2.57,3.43]} => {Recommended=yes} 0.003048780 1.0000000 0.003048780 3.352641 6
## [2] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=First Class,
## Total Rating=[2.57,3.43]} => {Recommended=yes} 0.006097561 0.6315789 0.009654472 2.117457 12
## [3] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Recommended=yes} => {Total Rating=[2.57,3.43]} 0.003048780 0.7500000 0.004065041 2.058577 6
## [4] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[1,1.86)} => {Recommended=yes} 0.003048780 0.6000000 0.005081301 2.011584 6
## [5] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Recommended=no} => {Total Rating=[1,1.86)} 0.002032520 0.5714286 0.003556911 1.955776 4
## [6] {Type.Of.Traveller=Business,
## Seat.Type=Business Class,
## Total Rating=[1,1.86)} => {Recommended=yes} 0.002032520 0.5714286 0.003556911 1.915795 4
## [7] {Type.Of.Traveller=Family Leisure,
## Seat.Type=First Class,
## Recommended=yes} => {Total Rating=[2.57,3.43]} 0.002032520 0.6666667 0.003048780 1.829847 4
## [8] {Type.Of.Traveller=Couple Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[1,1.86)} => {Recommended=yes} 0.003556911 0.5384615 0.006605691 1.805268 7
## [9] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class} => {Recommended=yes} 0.004065041 0.5333333 0.007621951 1.788075 8
## [10] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=First Class} => {Recommended=yes} 0.010670732 0.5121951 0.020833333 1.717206 21
## [11] {Type.Of.Traveller=Family Leisure,
## Seat.Type=First Class,
## Total Rating=[2.57,3.43]} => {Recommended=yes} 0.002032520 0.5000000 0.004065041 1.676320 4
## [12] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=First Class,
## Total Rating=[1,1.86)} => {Recommended=yes} 0.002540650 0.5000000 0.005081301 1.676320 5
## [13] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Recommended=yes} => {Total Rating=[1.86,2.57)} 0.004573171 0.5625000 0.008130081 1.637574 9
## [14] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=First Class,
## Recommended=yes} => {Total Rating=[2.57,3.43]} 0.006097561 0.5714286 0.010670732 1.568440 12
## [15] {Seat.Type=First Class,
## Recommended=yes,
## Total Rating=[2.57,3.43]} => {Type.Of.Traveller=Solo Leisure} 0.006097561 0.5000000 0.012195122 1.516179 12
## [16] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Total Rating=[1,1.86)} => {Recommended=no} 0.002032520 1.0000000 0.002032520 1.425054 4
## [17] {Type.Of.Traveller=Family Leisure,
## Seat.Type=First Class,
## Total Rating=[1,1.86)} => {Recommended=no} 0.004065041 1.0000000 0.004065041 1.425054 8
## [18] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Recommended=no} => {Total Rating=[2.57,3.43]} 0.007621951 0.5172414 0.014735772 1.419709 15
## [19] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007621951 0.9375000 0.008130081 1.335988 15
## [20] {Type.Of.Traveller=Family Leisure,
## Recommended=yes,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.014227642 0.9655172 0.014735772 1.241109 28
# Scatter plot of every rule (support vs confidence, shaded by lift).
plot(rule1)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Grouped matrix view of the rule set.
plot(rule1, method = "grouped")
# Top 20 rules ranked by confidence.
rules_by_confidence <- sort(rule1, by = "confidence")
inspect(head(rules_by_confidence, 20))
## lhs rhs support confidence coverage lift count
## [1] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Total Rating=[1,1.86)} => {Recommended=no} 0.002032520 1.0000000 0.002032520 1.425054 4
## [2] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Business Class,
## Total Rating=[2.57,3.43]} => {Recommended=yes} 0.003048780 1.0000000 0.003048780 3.352641 6
## [3] {Type.Of.Traveller=Family Leisure,
## Seat.Type=First Class,
## Total Rating=[1,1.86)} => {Recommended=no} 0.004065041 1.0000000 0.004065041 1.425054 8
## [4] {Type.Of.Traveller=Family Leisure,
## Recommended=yes,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.014227642 0.9655172 0.014735772 1.241109 28
## [5] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007621951 0.9375000 0.008130081 1.335988 15
## [6] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.057926829 0.8837209 0.065548780 1.135965 114
## [7] {Type.Of.Traveller=Solo Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.060467480 0.8686131 0.069613821 1.116545 119
## [8] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.056910569 0.8682171 0.065548780 1.116036 112
## [9] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.055386179 0.8650794 0.064024390 1.112003 109
## [10] {Type.Of.Traveller=Family Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.155995935 0.8647887 0.180386179 1.111629 307
## [11] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.071646341 0.8545455 0.083841463 1.098462 141
## [12] {Type.Of.Traveller=Family Leisure} => {Seat.Type=Economy Class} 0.205284553 0.8541226 0.240345528 1.097919 404
## [13] {Type.Of.Traveller=Family Leisure,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.078252033 0.8461538 0.092479675 1.087675 154
## [14] {Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.170731707 0.8358209 0.204268293 1.074393 336
## [15] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.041158537 0.8350515 0.049288618 1.073404 81
## [16] {Type.Of.Traveller=Solo Leisure,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.085365854 0.8316832 0.102642276 1.069074 168
## [17] {Type.Of.Traveller=Couple Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.042174797 0.8300000 0.050813008 1.066911 83
## [18] {Type.Of.Traveller=Solo Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.182926829 0.8275862 0.221036585 1.063808 360
## [19] {Type.Of.Traveller=Solo Leisure,
## Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.057926829 0.8260870 0.070121951 1.061881 114
## [20] {Type.Of.Traveller=Couple Leisure,
## Seat.Type=First Class,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007113821 0.8235294 0.008638211 1.173574 14
inspect(head(sort(rule1, by = "support"), 20))
## lhs rhs support confidence coverage lift count
## [1] {} => {Seat.Type=Economy Class} 0.7779472 0.7779472 1.0000000 1.0000000 1531
## [2] {} => {Recommended=no} 0.7017276 0.7017276 1.0000000 1.0000000 1381
## [3] {Recommended=no} => {Seat.Type=Economy Class} 0.5579268 0.7950760 0.7017276 1.0220180 1098
## [4] {Seat.Type=Economy Class} => {Recommended=no} 0.5579268 0.7171783 0.7779472 1.0220180 1098
## [5] {Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.2799797 0.7684798 0.3643293 0.9878303 551
## [6] {Type.Of.Traveller=Solo Leisure} => {Seat.Type=Economy Class} 0.2652439 0.8043143 0.3297764 1.0338933 522
## [7] {Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.2616870 0.7618343 0.3434959 0.9792880 515
## [8] {Total Rating=[2.57,3.43]} => {Recommended=no} 0.2535569 0.6959554 0.3643293 0.9917742 499
## [9] {Total Rating=[1.86,2.57)} => {Recommended=no} 0.2439024 0.7100592 0.3434959 1.0118729 480
## [10] {Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.2362805 0.8086957 0.2921748 1.0395252 465
## [11] {Type.Of.Traveller=Solo Leisure} => {Recommended=no} 0.2210366 0.6702619 0.3297764 0.9551597 435
## [12] {Recommended=yes} => {Seat.Type=Economy Class} 0.2200203 0.7376491 0.2982724 0.9481994 433
## [13] {Type.Of.Traveller=Family Leisure} => {Seat.Type=Economy Class} 0.2052846 0.8541226 0.2403455 1.0979186 404
## [14] {Total Rating=[1,1.86)} => {Recommended=no} 0.2042683 0.6991304 0.2921748 0.9962988 402
## [15] {Recommended=no,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.1986789 0.7835671 0.2535569 1.0072241 391
## [16] {Seat.Type=Economy Class,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.1986789 0.7096189 0.2799797 1.0112454 391
## [17] {Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.1885163 0.7729167 0.2439024 0.9935336 371
## [18] {Seat.Type=Economy Class,
## Total Rating=[1.86,2.57)} => {Recommended=no} 0.1885163 0.7203883 0.2616870 1.0265925 371
## [19] {Type.Of.Traveller=Solo Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.1829268 0.8275862 0.2210366 1.0638077 360
## [20] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Economy Class} => {Recommended=no} 0.1829268 0.6896552 0.2652439 0.9827961 360
rule2 <- apriori(reviews_apriori, parameter=list(suppor = 0.005, confidence = 0.75))
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 1968 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [49 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(sort(rule2, by = "lift"), 20))
## lhs rhs support confidence coverage lift count
## [1] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007621951 0.9375000 0.008130081 1.335988 15
## [2] {Type.Of.Traveller=Family Leisure,
## Recommended=yes,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.014227642 0.9655172 0.014735772 1.241109 28
## [3] {Type.Of.Traveller=Couple Leisure,
## Seat.Type=First Class,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007113821 0.8235294 0.008638211 1.173574 14
## [4] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.057926829 0.8837209 0.065548780 1.135965 114
## [5] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Economy Class,
## Total Rating=[1.86,2.57)} => {Recommended=no} 0.056910569 0.7943262 0.071646341 1.131958 112
## [6] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.005589431 0.7857143 0.007113821 1.119686 11
## [7] {Type.Of.Traveller=Solo Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.060467480 0.8686131 0.069613821 1.116545 119
## [8] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.056910569 0.8682171 0.065548780 1.116036 112
## [9] {Type.Of.Traveller=Business,
## Seat.Type=First Class,
## Total Rating=[1.86,2.57)} => {Recommended=no} 0.009146341 0.7826087 0.011686992 1.115260 18
## [10] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1.86,2.57)} => {Recommended=no} 0.065548780 0.7818182 0.083841463 1.114133 129
## [11] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.055386179 0.8650794 0.064024390 1.112003 109
## [12] {Type.Of.Traveller=Family Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.155995935 0.8647887 0.180386179 1.111629 307
## [13] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.071646341 0.8545455 0.083841463 1.098462 141
## [14] {Type.Of.Traveller=Family Leisure} => {Seat.Type=Economy Class} 0.205284553 0.8541226 0.240345528 1.097919 404
## [15] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1,1.86)} => {Recommended=no} 0.049288618 0.7698413 0.064024390 1.097066 97
## [16] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Premium Economy} => {Recommended=no} 0.011686992 0.7666667 0.015243902 1.092542 23
## [17] {Type.Of.Traveller=Family Leisure,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.078252033 0.8461538 0.092479675 1.087675 154
## [18] {Type.Of.Traveller=Family Leisure,
## Seat.Type=Economy Class} => {Recommended=no} 0.155995935 0.7599010 0.205284553 1.082900 307
## [19] {Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.022357724 0.7586207 0.029471545 1.081076 44
## [20] {Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.170731707 0.8358209 0.204268293 1.074393 336
# Show the 20 association rules with the highest confidence
# (conditional probability of the RHS given the LHS).
inspect(head(sort(rule2, by = "confidence"), 20))
## lhs rhs support confidence coverage lift count
## [1] {Type.Of.Traveller=Family Leisure,
## Recommended=yes,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.014227642 0.9655172 0.014735772 1.241109 28
## [2] {Type.Of.Traveller=Solo Leisure,
## Seat.Type=Premium Economy,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007621951 0.9375000 0.008130081 1.335988 15
## [3] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.057926829 0.8837209 0.065548780 1.135965 114
## [4] {Type.Of.Traveller=Solo Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.060467480 0.8686131 0.069613821 1.116545 119
## [5] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.056910569 0.8682171 0.065548780 1.116036 112
## [6] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.055386179 0.8650794 0.064024390 1.112003 109
## [7] {Type.Of.Traveller=Family Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.155995935 0.8647887 0.180386179 1.111629 307
## [8] {Type.Of.Traveller=Family Leisure,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.071646341 0.8545455 0.083841463 1.098462 141
## [9] {Type.Of.Traveller=Family Leisure} => {Seat.Type=Economy Class} 0.205284553 0.8541226 0.240345528 1.097919 404
## [10] {Type.Of.Traveller=Family Leisure,
## Total Rating=[2.57,3.43]} => {Seat.Type=Economy Class} 0.078252033 0.8461538 0.092479675 1.087675 154
## [11] {Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.170731707 0.8358209 0.204268293 1.074393 336
## [12] {Type.Of.Traveller=Family Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.041158537 0.8350515 0.049288618 1.073404 81
## [13] {Type.Of.Traveller=Solo Leisure,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.085365854 0.8316832 0.102642276 1.069074 168
## [14] {Type.Of.Traveller=Couple Leisure,
## Recommended=no,
## Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.042174797 0.8300000 0.050813008 1.066911 83
## [15] {Type.Of.Traveller=Solo Leisure,
## Recommended=no} => {Seat.Type=Economy Class} 0.182926829 0.8275862 0.221036585 1.063808 360
## [16] {Type.Of.Traveller=Solo Leisure,
## Recommended=no,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.057926829 0.8260870 0.070121951 1.061881 114
## [17] {Type.Of.Traveller=Couple Leisure,
## Seat.Type=First Class,
## Total Rating=[2.57,3.43]} => {Recommended=no} 0.007113821 0.8235294 0.008638211 1.173574 14
## [18] {Type.Of.Traveller=Family Leisure,
## Recommended=yes} => {Seat.Type=Economy Class} 0.049288618 0.8220339 0.059959350 1.056671 97
## [19] {Total Rating=[1,1.86)} => {Seat.Type=Economy Class} 0.236280488 0.8086957 0.292174797 1.039525 465
## [20] {Type.Of.Traveller=Family Leisure,
## Recommended=yes,
## Total Rating=[1.86,2.57)} => {Seat.Type=Economy Class} 0.014735772 0.8055556 0.018292683 1.035489 29
# Default arules scatter plot: each rule at (support, confidence), shaded by lift.
plot(rule2)
# Grouped matrix view: antecedent groups vs consequents, sized by support.
plot(rule2, method = "grouped")
#Naive Bayesian
# Load required libraries
library(e1071)
## Registered S3 methods overwritten by 'proxy':
## method from
## print.registry_field registry
## print.registry_entry registry
library(caTools)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
library(gridExtra)
library(class)
# Load the cleaned review data; stringsAsFactors = TRUE makes the categorical
# columns (including the target "Recommended") factors, as naiveBayes expects.
reviews <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)
# Select relevant predictor features
features <- c("Type.Of.Traveler", "Seat.Type", "Seat.Comfort", "Cabin.Staff.Services", "Food.Beverages", "Inflight.Entertainment", "Ground.Service", "Wifi", "Value.For.Money")
# Build the model formula "Recommended ~ f1 + f2 + ...".
# Named nb_formula so it does not mask stats::formula().
nb_formula <- as.formula(paste("Recommended ~", paste(features, collapse = "+")))
# Split data into training (80%) and testing (20%) sets, stratified on the target
set.seed(123)
trainIndex <- createDataPartition(reviews$Recommended, p = 0.8, list = FALSE)
trainData <- reviews[trainIndex, ]
testData <- reviews[-trainIndex, ]
# Ensure the target is a factor before evaluation
# (a no-op here since the CSV was read with stringsAsFactors = TRUE).
testData$Recommended <- as.factor(testData$Recommended)
# Train Naive Bayes classifier
nb_model <- naiveBayes(nb_formula, data = trainData)
# Make predictions on test data (features only, no target leakage)
predictions <- predict(nb_model, testData[, features])
# Evaluate performance of model
conf_matrix <- confusionMatrix(predictions, testData$Recommended)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 271 113
## yes 4 1
##
## Accuracy : 0.6992
## 95% CI : (0.651, 0.7444)
## No Information Rate : 0.7069
## P-Value [Acc > NIR] : 0.6541
##
## Kappa : -0.008
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.985455
## Specificity : 0.008772
## Pos Pred Value : 0.705729
## Neg Pred Value : 0.200000
## Prevalence : 0.706941
## Detection Rate : 0.696658
## Detection Prevalence : 0.987147
## Balanced Accuracy : 0.497113
##
## 'Positive' Class : no
##
# Visualise the Naive Bayes confusion matrix as a tile heat map,
# with cell counts printed on top of the fill gradient.
cm_long <- as.data.frame(conf_matrix$table)
cm_plot <- ggplot(data = cm_long, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white") +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(x = "Actual", y = "Predicted", title = "Confusion Matrix") +
  theme_minimal()
print(cm_plot)
# Persist the (unchanged) review data back to disk.
write.csv(reviews, "/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", row.names = FALSE)
#Decision Trees
mydataset <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)
mydataset$Total.Ratings <- as.integer(mydataset$Total.Ratings)
# Drop the free-text / route / date columns that a tree cannot use directly.
mydataset <- mydataset[,-c(3,12,13,15,16)]
str(mydataset)
## 'data.frame': 1952 obs. of 11 variables:
## $ Type.Of.Traveler : Factor w/ 4 levels "Business","Couple Leisure",..: 3 2 2 2 1 1 4 4 1 3 ...
## $ Seat.Type : Factor w/ 4 levels "Business Class",..: 2 2 3 2 3 2 2 2 2 4 ...
## $ Recommended : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 2 1 1 ...
## $ Seat.Comfort : int 1 2 1 1 4 2 4 1 2 1 ...
## $ Cabin.Staff.Services : int 2 3 1 2 5 3 5 2 1 2 ...
## $ Food.Beverages : int 3 1 1 3 1 4 1 3 1 1 ...
## $ Inflight.Entertainment: int 1 2 1 4 2 5 2 4 2 2 ...
## $ Ground.Service : int 2 3 1 1 3 1 3 5 3 3 ...
## $ Wifi : int 3 1 1 2 4 2 4 1 1 4 ...
## $ Value.For.Money : int 1 2 1 3 1 3 5 1 1 5 ...
## $ Verification : Factor w/ 2 levels "Not Verified",..: 2 2 2 1 2 1 1 1 2 2 ...
##Splitting data into training and testing set
set.seed(123) # set a seed for reproducibility
library(caTools) # load the necessary library
# Named recommend_levels (not "options") to avoid masking base::options().
recommend_levels <- c("yes", "no")
# Randomly re-assign "yes"/"no" to each observation: the real labels are heavily
# skewed toward negative feedback, which yields a degenerate one-node tree.
# The random labels are not meaningful, but they let a tree be grown at all.
mydataset$Recommended <- sample(recommend_levels, size = nrow(mydataset), replace = TRUE)
# BUG FIX: sample.split() expects the label VECTOR, not the whole data frame;
# passing the data frame made the split depend on the column layout instead of
# stratifying the rows by outcome.
split_data <- sample.split(mydataset$Recommended, SplitRatio = 0.9) # split the data into 90% for training and 10% for testing
train_data <- subset(mydataset, split_data == TRUE) # create the training set
test_data <- subset(mydataset, split_data == FALSE) # create the testing set
write.csv(train_data, "/Users/rahul.chauhan/Desktop/train_data.csv", row.names=FALSE)
write.csv(test_data, "/Users/rahul.chauhan/Desktop/test_data.csv", row.names=FALSE)
##First Model
library(rpart)
# Fit a classification tree predicting Recommended from every other column.
model1 <- rpart(Recommended ~ ., data = train_data)
# A tree whose frame has a single row never split past the root;
# plotting such a stump errors out, so report it instead.
if (nrow(model1$frame) == 1) {
  message("The decision tree has only one node.")
} else {
  plot(model1, margin = 0.1)
  text(model1, cex = 0.7)
}
# Class predictions on the held-out rows, summarised as a confusion matrix.
pr1 <- predict(model1, newdata = test_data, type = "class")
confusionMatrix(table(pred = pr1, true = test_data$Recommended))
## Confusion Matrix and Statistics
##
## true
## pred no yes
## no 47 49
## yes 136 122
##
## Accuracy : 0.4774
## 95% CI : (0.4243, 0.5308)
## No Information Rate : 0.5169
## P-Value [Acc > NIR] : 0.9384
##
## Kappa : -0.0292
##
## Mcnemar's Test P-Value : 2.568e-10
##
## Sensitivity : 0.2568
## Specificity : 0.7135
## Pos Pred Value : 0.4896
## Neg Pred Value : 0.4729
## Prevalence : 0.5169
## Detection Rate : 0.1328
## Detection Prevalence : 0.2712
## Balanced Accuracy : 0.4851
##
## 'Positive' Class : no
##
library(rpart.plot)
# cp = 0 keeps every split, i.e. performs no pruning at all. Prune instead at
# the complexity parameter with the lowest cross-validated error (xerror),
# the standard rpart model-selection rule.
best_cp1 <- model1$cptable[which.min(model1$cptable[, "xerror"]), "CP"]
pruned_tree <- prune.rpart(model1, cp = best_cp1)
prp(pruned_tree, faclen=0, extra=1,digits=5)
rpart.plot(model1, box.palette="RdGn", shadow.col="gray", nn=TRUE)
##Second Model
# Restrict the tree to the three categorical predictors only.
model2 <- rpart(Recommended ~ Type.Of.Traveler + Seat.Type + Verification, data = train_data)
# Plot only when the tree actually split; a root-only tree cannot be drawn.
if (nrow(model2$frame) == 1) {
  message("The decision tree has only one node.")
} else {
  plot(model2, margin = 0.1)
  text(model2, cex = 0.7)
}
# Evaluate on the held-out set.
pr2 <- predict(model2, newdata = test_data, type = "class")
confusionMatrix(table(pred = pr2, true = test_data$Recommended))
## Confusion Matrix and Statistics
##
## true
## pred no yes
## no 47 58
## yes 136 113
##
## Accuracy : 0.452
## 95% CI : (0.3993, 0.5055)
## No Information Rate : 0.5169
## P-Value [Acc > NIR] : 0.9938
##
## Kappa : -0.0811
##
## Mcnemar's Test P-Value : 3.234e-08
##
## Sensitivity : 0.2568
## Specificity : 0.6608
## Pos Pred Value : 0.4476
## Neg Pred Value : 0.4538
## Prevalence : 0.5169
## Detection Rate : 0.1328
## Detection Prevalence : 0.2966
## Balanced Accuracy : 0.4588
##
## 'Positive' Class : no
##
# cp = 0 performs no pruning; prune model2 at the cross-validated-error
# minimising complexity parameter instead.
best_cp2 <- model2$cptable[which.min(model2$cptable[, "xerror"]), "CP"]
pruned_tree <- prune.rpart(model2, cp = best_cp2)
prp(pruned_tree, faclen=0, extra=1,digits=5)
rpart.plot(model2, box.palette="RdGn", shadow.col="gray", nn=TRUE)
##Third Model
set.seed(1234)
# Randomly re-assign the verification label (same rebalancing caveat as for
# Recommended above). Named to avoid masking base::options().
verification_levels <- c("Verified", "Not Verified")
mydataset$Verification <- sample(verification_levels, size = nrow(mydataset), replace = TRUE)
# BUG FIX: sample.split() takes the label vector, not the data frame.
split_data <- sample.split(mydataset$Verification, SplitRatio = 0.9) # split the data into 90% for training and 10% for testing
train_data <- subset(mydataset, split_data == TRUE) # create the training set
test_data <- subset(mydataset, split_data == FALSE) # create the testing set
# Predict Verification from all remaining columns.
model3 <- rpart(Verification ~ ., data = train_data)
# check if the decision tree has more than one node before plotting
if(nrow(model3$frame) > 1) {
plot(model3, margin = 0.1)
text(model3, cex = 0.7)
} else {
message("The decision tree has only one node.")
}
pr3 <- predict(model3, newdata = test_data, type="class")
confusionMatrix(table(pred = pr3, true = test_data$Verification))
## Confusion Matrix and Statistics
##
## true
## pred Not Verified Verified
## Not Verified 98 97
## Verified 72 88
##
## Accuracy : 0.5239
## 95% CI : (0.4706, 0.5769)
## No Information Rate : 0.5211
## P-Value [Acc > NIR] : 0.47912
##
## Kappa : 0.0518
##
## Mcnemar's Test P-Value : 0.06487
##
## Sensitivity : 0.5765
## Specificity : 0.4757
## Pos Pred Value : 0.5026
## Neg Pred Value : 0.5500
## Prevalence : 0.4789
## Detection Rate : 0.2761
## Detection Prevalence : 0.5493
## Balanced Accuracy : 0.5261
##
## 'Positive' Class : Not Verified
##
# cp = 0 performs no pruning; prune model3 at the cross-validated-error
# minimising complexity parameter instead.
best_cp3 <- model3$cptable[which.min(model3$cptable[, "xerror"]), "CP"]
pruned_tree <- prune.rpart(model3, cp = best_cp3)
prp(pruned_tree, faclen=0, extra=1,digits=5)
rpart.plot(model3, box.palette="RdGn", shadow.col="gray", nn=TRUE)
#SVM
library(e1071)
#We need only labeled numeric data for SVMs, let's look at our dataframe to check for numeric labelled entries. Also, check for data that can be converted to numeric type.
reviews <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)
# Here, we can see that we have "Seat Comfort", "Cabin Staff Services", "Food & Beverages", "Inflight Entertainment", "Ground Service", "Wifi", "Value for money", "Total Ratings" which are labeled and numeric.
# We can also convert "Verification", "Type of Traveler, "Seat Type" and "Recommended" to numeric.
# "Origin", "Destination", "Reviews", "Date flown" cannot be used for SVMs.
# Work on a copy so the original data frame is left untouched.
reviews_svm <- as.data.frame(reviews)
# Drop the unusable columns (presumably the text/route/date ones listed above
# — TODO confirm the column indices against the CSV header).
reviews_svm <- reviews_svm[ ,-c(3,12,14,15,16)]
# Report the class of each remaining column.
sapply(reviews_svm, class)
## Type.Of.Traveler Seat.Type Recommended
## "factor" "factor" "factor"
## Seat.Comfort Cabin.Staff.Services Food.Beverages
## "integer" "integer" "integer"
## Inflight.Entertainment Ground.Service Wifi
## "integer" "integer" "integer"
## Value.For.Money Total.Ratings
## "integer" "numeric"
# Encode the two factor predictors as their underlying integer level codes.
# (Both are read from reviews_svm for consistency; the column is identical to
# the one in reviews, since the copy only dropped other columns.)
reviews_svm$Type.Of.Traveler <- as.numeric(reviews_svm$Type.Of.Traveler)
reviews_svm$Seat.Type <- as.numeric(reviews_svm$Seat.Type)
# Re-check column classes after the factor-to-numeric conversion.
sapply(reviews_svm, class)
## Type.Of.Traveler Seat.Type Recommended
## "numeric" "numeric" "factor"
## Seat.Comfort Cabin.Staff.Services Food.Beverages
## "integer" "integer" "integer"
## Inflight.Entertainment Ground.Service Wifi
## "integer" "integer" "integer"
## Value.For.Money Total.Ratings
## "integer" "numeric"
# Full structure dump: only Recommended remains a factor (the SVM target).
str(reviews_svm)
## 'data.frame': 1952 obs. of 11 variables:
## $ Type.Of.Traveler : num 3 2 2 2 1 1 4 4 1 3 ...
## $ Seat.Type : num 2 2 3 2 3 2 2 2 2 4 ...
## $ Recommended : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 2 1 1 ...
## $ Seat.Comfort : int 1 2 1 1 4 2 4 1 2 1 ...
## $ Cabin.Staff.Services : int 2 3 1 2 5 3 5 2 1 2 ...
## $ Food.Beverages : int 3 1 1 3 1 4 1 3 1 1 ...
## $ Inflight.Entertainment: int 1 2 1 4 2 5 2 4 2 2 ...
## $ Ground.Service : int 2 3 1 1 3 1 3 5 3 3 ...
## $ Wifi : int 3 1 1 2 4 2 4 1 1 4 ...
## $ Value.For.Money : int 1 2 1 3 1 3 5 1 1 5 ...
## $ Total.Ratings : num 1.86 2 1 2.29 2.86 ...
# Peek at the first rows of the fully numeric feature table.
head(reviews_svm)
## Type.Of.Traveler Seat.Type Recommended Seat.Comfort Cabin.Staff.Services
## 1 3 2 no 1 2
## 2 2 2 yes 2 3
## 3 2 3 no 1 1
## 4 2 2 no 1 2
## 5 1 3 no 4 5
## 6 1 2 no 2 3
## Food.Beverages Inflight.Entertainment Ground.Service Wifi Value.For.Money
## 1 3 1 2 3 1
## 2 1 2 3 1 2
## 3 1 1 1 1 1
## 4 3 4 1 2 3
## 5 1 2 3 4 1
## 6 4 5 1 2 3
## Total.Ratings
## 1 1.857143
## 2 2.000000
## 3 1.000000
## 4 2.285714
## 5 2.857143
## 6 2.857143
##Linear Kernel with traditional method
library(e1071)
set.seed(123)
# BUG FIX: sample.split() stratifies on the label VECTOR; passing the whole
# data frame (as before) produced a column-count-driven pattern rather than a
# proper row split. NOTE(review): the recorded output below predates this fix.
split_data <- sample.split(reviews_svm$Recommended, SplitRatio = 0.9) # split the data into 90% for training and 10% for testing
train <- subset(reviews_svm, split_data == TRUE) # create the training set
test <- subset(reviews_svm, split_data == FALSE) # create the testing set
# Fit a linear-kernel SVM and evaluate on the held-out rows.
svm_linear <- svm(Recommended ~ ., data = train, kernel = "linear")
svm_linear_pred <- predict(svm_linear, test)
svm_linear_cm <- confusionMatrix(svm_linear_pred, test$Recommended)
svm_linear_accuracy <- svm_linear_cm$overall["Accuracy"]
print("Confusion matrix for SVM with linear kernel:")
## [1] "Confusion matrix for SVM with linear kernel:"
print(svm_linear_cm$table)
## Reference
## Prediction no yes
## no 240 115
## yes 0 0
print(paste("Accuracy:", svm_linear_accuracy))
## [1] "Accuracy: 0.676056338028169"
# plot the SVM decision regions over Total.Ratings vs Type.Of.Traveler
plot(svm_linear, reviews_svm, Total.Ratings ~ Type.Of.Traveler)
##Polynomial Kernel with traditional method
# Same train/test split as the linear model; degree-3 polynomial kernel.
svm_poly <- svm(Recommended ~ ., data = train, kernel = "polynomial", degree = 3)
svm_poly_pred <- predict(svm_poly, test)
svm_poly_cm <- confusionMatrix(svm_poly_pred, test$Recommended)
svm_poly_accuracy <- svm_poly_cm$overall["Accuracy"]
print("Confusion matrix for SVM with polynomial kernel:")
## [1] "Confusion matrix for SVM with polynomial kernel:"
print(svm_poly_cm$table)
## Reference
## Prediction no yes
## no 240 115
## yes 0 0
print(paste("Accuracy:", svm_poly_accuracy))
## [1] "Accuracy: 0.676056338028169"
# NOTE(review): the model predicts only the majority class ("no"), so accuracy
# equals class prevalence; consider class weights or rebalancing.
plot(svm_poly, reviews_svm, Total.Ratings ~ Type.Of.Traveler)
##Radial Kernel with traditional method
# Same split again; radial basis function (RBF) kernel with default gamma.
svm_radial <- svm(Recommended ~ ., data = train, kernel = "radial")
svm_radial_pred <- predict(svm_radial, test)
svm_radial_cm <- confusionMatrix(svm_radial_pred, test$Recommended)
svm_radial_accuracy <- svm_radial_cm$overall["Accuracy"]
print("Confusion matrix for SVM with radial kernel:")
## [1] "Confusion matrix for SVM with radial kernel:"
print(svm_radial_cm$table)
## Reference
## Prediction no yes
## no 240 115
## yes 0 0
print(paste("Accuracy:", svm_radial_accuracy))
## [1] "Accuracy: 0.676056338028169"
# NOTE(review): again a majority-class-only predictor; all three kernels tie
# at the prevalence rate on this split.
plot(svm_radial, reviews_svm, Total.Ratings ~ Type.Of.Traveler)
#All the kernel using three different costs for each
library(e1071)
set.seed(42)
# 90/10 row split via multinomial index sampling.
ind <- sample(2, nrow(reviews_svm), replace = TRUE, prob = c(0.9, 0.1))
train_data <- reviews_svm[ind == 1,]
test_data <- reviews_svm[ind == 2,]
write.csv(train_data, "/Users/rahul.chauhan/Desktop/train_data_svm.csv", row.names=FALSE)
write.csv(test_data, "/Users/rahul.chauhan/Desktop/test_data_svm.csv", row.names=FALSE)
kernels <- c("linear", "polynomial", "radial")
costs <- c(0.1, 1, 10)
# Fit an SVM with the given kernel and cost, plot its decision regions, and
# print accuracy plus the confusion matrix on the test set.
# Generalised: the data sets are now parameters whose defaults preserve the
# original behaviour of reading the script-level train_data/test_data.
# Returns the test accuracy invisibly so callers may collect results.
fit_svm <- function(kernel, cost, train = train_data, test = test_data) {
  model <- svm(Recommended ~ ., data = train, kernel = kernel, cost = cost)
  preds <- predict(model, test)
  accuracy <- sum(preds == test$Recommended) / length(preds)
  cm <- table(Predicted = preds, Actual = test$Recommended)
  plot(model, train, Seat.Type ~ Seat.Comfort, main = paste("Kernel:", kernel, ", Cost:", cost))
  cat("Kernel:", kernel, ", Cost:", cost, "\n")
  cat("Accuracy:", accuracy, "\n")
  cat("Confusion Matrix:\n")
  print(cm)
  cat("\n")
  invisible(accuracy)
}
# Evaluate every kernel/cost combination on the shared split.
for (kernel in kernels) {
  for (cost in costs) {
    fit_svm(kernel, cost)
  }
}
## Kernel: linear , Cost: 0.1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: linear , Cost: 1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: linear , Cost: 10
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: polynomial , Cost: 0.1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: polynomial , Cost: 1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: polynomial , Cost: 10
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: radial , Cost: 0.1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: radial , Cost: 1
## Accuracy: 0.7156398
## Confusion Matrix:
## Actual
## Predicted no yes
## no 151 60
## yes 0 0
## Kernel: radial , Cost: 10
## Accuracy: 0.7061611
## Confusion Matrix:
## Actual
## Predicted no yes
## no 147 58
## yes 4 2